Seawater samples were collected for (1) microbial metagenomics and (2) physico-chemical data (temperature, salinity, and particulate/dissolved nutrient concentrations) from 48 offshore reefs across the length of the GBR, within the Great Barrier Reef Microbial Genomics Database (GBR-MGD) initiative by Australia’s Integrated Marine Observing System (IMOS). This sampling was done alongside ongoing in situ health surveys by the Australian Institute of Marine Science Long-Term Monitoring Program (AIMS-LTMP).
The code below was used to create a map showing the 48 IMOS GBR-MGD sites, by combining these two tutorials: 1. https://open-aims.github.io/gisaimsr/articles/examples.html 2. https://r-spatial.org/r/2018/10/25/ggplot2-sf-2.html
# Importing the coordinates
map_coords <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Metadata_files/MARKO_for_eReefs_Lats_Longs.csv")
# View(map_coords)
# Now converting from data frame into sf format
map_coords <- st_as_sf(map_coords,
coords = c("lon", "lat"),
remove = FALSE,
crs = 4283, # this is the reference code for the CRS system GDA94, used by dataaimsr & gisaimsr R packages
agr = "constant")
# I will now add the info on Sampling trip - this will be needed when plotting
# This is the final metadata file, with average values of env. measurements (averaged per Reef site)
map_reef_names_and_trip <- read.csv(file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/00_FINAL_PIPELINE/01_Preparation_of_Environmental_Metadata/metadata_with_reef_names_maps.csv") %>%
dplyr::select(REEF_NAME, Sampling_trip)
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
map_reef_names_and_trip$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Second trip
map_reef_names_and_trip$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Third trip
map_reef_names_and_trip$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Fourth trip
map_reef_names_and_trip$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Merging with 'Sampling trip' info
map_coords <- left_join(map_coords, map_reef_names_and_trip, by = c("name" = "REEF_NAME"))
#########################
# Now plotting
# And now defining colors for the map
cols_map <- c("tomato3", # Enclosed Coastal
"salmon3", # Macro Tidal Enclosed Coastal
"pink3", # Macro Tidal Open Coastal
"peachpuff", # Midshelf
"lightsteelblue", # Offshore
"lightcoral") # Open coastal
# Plotting without the mainland - otherwise I am just losing precious space
gbr_no_mainland <- gbr_feat %>%
dplyr::filter(FEAT_NAME != "Mainland")
# ------------------------------------------------ #
# Overlaying IMOS-MGD sites - only as points first #
# ------------------------------------------------ #
col.per.trip <- factor(map_coords$Sampling_trip, levels = c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020"))
colors <- c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
names(colors) <- c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020")
# Importing city coordinates
oz_cities <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Input_files/oz_cities.csv")
# Plot
IMOS_MGD_dots_trip = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
# Include the region info too (the 3 lines below)
# geom_sf(data = nrm_regions,
# mapping = aes(fill = NAME), lwd = 0.01) +
# scale_fill_brewer(name = "Region", palette = "Set3") +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat, lwd = 0.01,
fill = "seashell2"
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
# alpha = 0.6, # This is to ensure
show.legend = "point") +
coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
geom_text_repel(data = oz_cities, aes(x = Longitude, y = Latitude, label = Town), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
col = 'black',
nudge_x = c(-0.5, # Townsville
-0.9, # Brisbane
-0.5, # Cairns
-0.8, # Cooktown
-0.5, # Mackay
-0.7), # Bundaberg
nudge_y = c(-0.5, # Townsville
0.5, # Brisbane
-0.5, # Cairns
-0.5, # Cooktown
-0.5, # Mackay
-0.5), # Bundaberg
)+
scale_color_manual(name = "Dates of Sampling Transects", values=colors)+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5, linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS-MGD",
subtitle = "Microbial Genomics Database sites") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_dots_trip
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (bellow) A more detailed map showing the name of each reef site, and their membership to either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.
To show reefs in more detail, we also plot a close-up of sites within each trip.
# But need to split map_coords file per trip
map_coords_trip1 <- filter(map_coords, Sampling_trip=="Trip_01_Nov-Dec_2019")
map_coords_trip2 <- filter(map_coords, Sampling_trip=="Trip_02_January_2020")
map_coords_trip3 <- filter(map_coords, Sampling_trip=="Trip_03_February_2020")
map_coords_trip4 <- filter(map_coords, Sampling_trip=="Trip_04_July_2020")
IMOS_MGD_trip1 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2"
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip1, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip1, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(1.2, # MCSWEENEY REEF
1.2, # MONSOON REEF, - sign means it will move to the left
1.2, # 11-049
1.2, # 11-162
0.9, # MANTIS REEF
0.6, # LAGOON REEF
0.4, # DAVIE REEF
-0.2, # CORBETT REEF
0.4, # 13-124
-0.1), # SANBANK 1 REEF
# This should be 48 times, for our 48 sites
nudge_y = c(0.2, # MCSWEENEY REEF
0.1, # MONSOON REEF
0.1, # 11-049, - sign means it will go down
0.1, # 11-162
0.2, # MANTIS REEF
-0.3, # LAGOON REEF
0.2, # DAVIE REEF
-0.8, # CORBETT REEF
0.3, # 13-124
-0.6), ) + # SANDBANK 1 REEF
coord_sf(xlim = c(143, 147), ylim = c(-11, -16.5)) +
scale_color_manual(name = "Sampling trip", values=c("indianred")) + # color I am using for Sampling trip 1
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5, linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 1 (Nov-Dec 2019)")
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
# theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip1
IMOS_MGD_trip2 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2"
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip2, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip2, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(-0.1, # FAIRFAX REEF
-0.4, # HOSKYN REEF
0.3, # BOULT REEF
0.3, # MASTHEAD REEF
-0.2, # ERSKINE REEF
0.4, # BROOMFIELD REEF
0.1, # 21-550
-0.5, # 22-084
0.4, # CHINAMAN REEF
-0.1, # 21-580
0.2, # SMALL LAGOON REEF
-0.3), # NORTH REEF
# This should be 48 times, for our 48 sites
nudge_y = c(-0.1, # FAIRFAX REEF
-0.1, # HOSKYN REEF
0.2, # BOULT REEF
-0.1, # MASTHEAD REEF
0.2, # ERSKINE REEF
0.2, # BROOMFIELD REEF
0.2, # 21-550
-0.3, # 22-084
0.2, # CHINAMAN REEF
-0.6, # 21-580
0.4, # SMALL LAGOON REEF
0.1), # NORTH REEF
) + # SANDBANK 1 REEF
coord_sf(xlim = c(151, 153), ylim = c(-21.5, -24)) +
scale_color_manual(name = "Sampling trip", values=c("indianred4")) + # color I am using for Sampling trip 1
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5, linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 2 (January 2020)")
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
# theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip2
IMOS_MGD_trip3 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2"
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip3, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip3, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(#0.3, # ST CRISPIN
0.3, # AGINCOURT1 REEF
0.3, # HASTINGS REEF
0.3, # ARLINGTON REEF
0.4, # THETFORD REEF
0.4, # MOORE REEF
0.3, # HEDLEY REEF
0.3, # MCCULLOCH REEF
0.4, # PEART REEF
0.4, # FEATHER REEF
0.1, # FARQUAHARSON REEF
0.3), # TAYLOR REEF
# This should be 48 times, for our 48 sites
nudge_y = c(#-0.1, # ST CRISPIN
0.2, # AGINCOURT1 REEF
0.2, # HASTINGS REEF
0.1, # ARLINGTON REEF
0.2, # THETFORD REEF
0.1, # MOORE REEF
0.1, # HEDLEY REEF
0.2, # MCCULLOCH REEF
0.1, # PEART REEF
-0.1, # FEATHER REEF
0.2, # FARQUAHARSON REEF
-0.1), # TAYLOR REEF
) +
coord_sf(xlim = c(145.4, 147), ylim = c(-15.8, -18)) +
scale_color_manual(name = "Sampling trip", values=c("red"))+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 3 (February 2020)") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip3
IMOS_MGD_trip4 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2"
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip4, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip4, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(-0.2, # LITTLE KELSO REEF
-0.2, # KELSO REEF
0.5, # ROXBURGH REEF
0.2, # FORE&AFT REEF
0.2, # RIB REEF
-0.1, # JOHN BREWER REEF
0.1, # MYRMIDON REEF
-0.3, # CHICKEN REEF
0.3, # KNIFE REEF
0.2, # FORK REEF
0.2, # LYNCHS REEF
-0.1, # CENTIPEDE REEF
-0.1, # GRUB REEF
-0.2), # HELIX REEF
# This should be 48 times, for our 48 sites
nudge_y = c(-0.2, # LITTLE KELSO REEF
-0.1, # KELSO REEF
0.1, # ROXBURGH REEF
0, # FORE&AFT REEF
0.1, # RIB REEF
-0.2, # JOHN BREWER REEF
0.1, # MYRMIDON REEF
-0.3, # CHICKEN REEF
0.1, # KNIFE REEF
0, # FORK REEF
-0.2, # LYNCHS REEF
-0.1, # CENTIPEDE REEF
-0.1, # GRUB REEF
-0.2), # HELIX REEF
) +
coord_sf(xlim = c(146.8, 148), ylim = c(-18.1, -19)) +
scale_color_manual(name = "Sampling trip", values=c("slateblue"))+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 4 (July 2020)") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip4
wrangling <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/all_seawater_0-05/metadata_wrangling.csv")
WQ_methods <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/Getting_the_environmental_data/Data_from_the_water_quality_team/FINAL_QCd_files/WQ_IDs_IMOS-MGD_only.csv")
# And this is one of the WQ spreadsheets - link between WQ IDs and our Reef names
# Selecting only the columns of interest
WQ_methods <- dplyr::select(WQ_methods, one_of(c("REEF_NAME",
"WQ_Station_Name",
"Collection_method", # diving or from boat
"Sample_collection_start")))#,
# We decided not to include the metrics bellow
# "Swell_direction",
# "Swell_height",
# "Wind_direction",
#"Wind_speed" )))
# Additional data from the LTMP trips
LTMP <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/all_seawater_0-05/metadata_IMOS_from_Mike.csv") %>%
dplyr::select("REEF_NAME", "Sampling_trip", "GBR_sector", "SAMPLE_DATE", "Lat", "Long")
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
LTMP$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Second trip
LTMP$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Third trip
LTMP$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Fourth trip
LTMP$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Joining - first step
metadata <- left_join(wrangling, LTMP)
# In this step I added the Sample IDs to the LTMP data
metadata <- left_join(metadata, WQ_methods)
# And in here the info from the WQ team
# importing the actual water chemistry measurements
WQ_Result_Report <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/Getting_the_environmental_data/Data_from_the_water_quality_team/FINAL_QCd_files/MBM_Result_Report_R_Dec_2022.csv")
# Removing Temperature and Salinity for now - too many missing values
WQ_Result_Report <- dplyr::select(WQ_Result_Report, one_of(c("WQ_Station_Name",
"DEPTH",
"Chlorophyll_a_.µg.L.",
"Phaeophytin_a_.µg.L.",
"PN_.µM.",
"POC_.µM.",
"PP_.µM.",
"DOC_.µM.",
"PO4_.µM.",
"NH4_.µM.",
"NO2_.µM.",
"NO3_.µM.",
"Si_.µM.",
"TDN_.µM.",
"TDP_.µM.",
"TSS_.mg.L.")))
# "Salinity",
# "Temperature.C..")))
# Now adding the Reef_name info
wq.all.reps.for.pca <- left_join(WQ_methods, WQ_Result_Report)
# I also need the info on Sampling trips - to color the groups on the PCA. I will also add the data from the research vessel at this stage - Temperature, Salinity, Turbidity, Fluorescence
reefs_trips <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/00_FINAL_PIPELINE/01_Preparation_of_Environmental_Metadata/metadata_with_reef_names.csv")
# But I only want reef names and their corresponding trips for now!
reefs_trips <- reefs_trips[,c(1,3)]
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
reefs_trips$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Second trip
reefs_trips$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Third trip
reefs_trips$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Fourth trip
reefs_trips$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Now adding the metadata from the RV
vessel_metadata <- read.csv("/home/markoterzin/Documents/PhD/FROM/from_Patrick/IMOS-MGD_additional_metadata/GBR-Genomics-Database_Seawater-Illumina-Reads.csv")
# Keeping only: Temperature, Salinity, Turbidity, Fluorescence
vessel_metadata <- dplyr::select(vessel_metadata, one_of(c("REEF_NAME",
"SEAWATER_TEMPERATURE_2.5m_RV",
"SALINITY_2.5m_RV",
# "TURBIDITY_2.5m_RV",
"FLUORESCENCE_2.5m_RV")))
# Joining now
reefs_trips <- left_join(reefs_trips,
vessel_metadata)
wq.all.reps.for.pca <- left_join(wq.all.reps.for.pca,
reefs_trips)
PCA was applied on a physico-chemical dataset containing 17 variables, including: 1. 14 water chemistry variables: ammonia (NH4), nitrite (NO2), nitrate (NO3), total dissolved nitrogen (TDN), phosphate (PO4), total dissolved phosphorus (TDP), dissolved organic carbon (DOC), silicate (Si), total suspended solids (TSS), chlorophyll a (Chl-a), phaeophytin a (Phaeo), particulate organic carbon (POC), particulate nitrogen (PN), and particulate phosphorus (PP). For each of these 14 water chemistry variables, triplicate 5 L seawater samples were collected using Niskin bottles for analysis of water chemistry variables, at each of the 48 reefs. 2. temperature, fluorescence, and salinity measurements from the underway sampling systems on the RV Solander and RV Cape Ferguson, with intake depths for underway systems were 1.9 m (RV Cape Ferguson) and 2.5 m (RV Solander). For these three measurements, one value per reef site was recorded.
The mixOmics function tune.pca() calculates the cumulative proportion of explained variance for a large number of principal components (here we set ncomp = 10). A screeplot of the proportion of explained variance relative to the total amount of variance in the data for each principal component is output.
# In PCA, we first count the number of missing values, as this will tell us whether PCA will be solved using SVD (no missing values) or iterative NIPALS (with missing values) internally in the mixOmics function pca().
sum(is.na(wq.all.reps.for.pca[,c(6:14, 16:23)]))
## [1] 17
# Number of NAs
## [1] 17
# Since we have some missing values, the iterative NIPALS will be called inside pca()
tune.pca.WQ <- tune.pca(wq.all.reps.for.pca[,c(6:14, 16:23)], ncomp = 10, scale = TRUE)
plot(tune.pca.WQ)
Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.wq.all.reps <- pca(wq.all.reps.for.pca[,c(6:14, 16:23)], # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.wq.all.reps$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.4066099 |
| PC2 | 0.1964915 |
| PC3 | 0.0910662 |
| PC4 | 0.0643839 |
| PC5 | 0.0470814 |
| PC6 | 0.0422856 |
| PC7 | 0.0305387 |
| PC8 | 0.0288760 |
| PC9 | 0.0231195 |
| PC10 | 0.0200885 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.wq.all.reps$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.4066099 |
| PC2 | 0.6031014 |
| PC3 | 0.6941676 |
| PC4 | 0.7585515 |
| PC5 | 0.8056329 |
| PC6 | 0.8479185 |
| PC7 | 0.8784572 |
| PC8 | 0.9073332 |
| PC9 | 0.9304527 |
| PC10 | 0.9505412 |
PCA_WQ_sample_plot <- plotIndiv(pca.wq.all.reps,
comp = c(1, 2),
group = wq.all.reps.for.pca$Sampling_trip,
# ind.names = wq.all.reps.for.pca$REEF_NAME,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
title = 'WQ Metadata all reps, PCA comp 1 - 2')
PCA_WQ_biplot <- biplot(pca.wq.all.reps,
comp = c(1, 2),
group = wq.all.reps.for.pca$Sampling_trip,
# ind.names = wq.all.reps.for.pca$REEF_NAME,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
legend.title = "Sampling trip",
title = 'PCA biplot for WQ Metadata all reps, PCA comp 1 - 2')
The PCA results suggest that our water chemistry measurements from across the GBR were largely driven by seasonality, while geography had a weaker influence. Chemistry profiles of samples collected in early austral summer were comparable despite being >1500 km apart in the far north (Cape Grenville and Princess Charlotte bay sectors) and far south (Swains and Capricorn Bunker sectors) of the GBR, whereas samples collected during the peaks of austral summer and winter were the most distinct although they were geographically close in the central GBR (~200 km apart, Cairns and Cooktown / Lizard island sectors for austral summer samples, and Innisfail and Townsville sectors for austral winter samples). Further, we observe that summer trips 1-3 were characterised by elevated temperature and higher concentrations of dissolved and particulate nutrients, apart frpm TDP and phosphate which were elevated during winter.
However, we did not show reef names in either of the PCA plots as there is an overlap between data points (and hence the text is not readable), and also in these PCA visualisations, we lose context of raw values. This information was added with a heatmap (to compare physico-chemical metrics across sites) and with boxplots (which show the raw physico-chemical measurements).
We first collapsed the data to a mean/median value because for each of the 17 environmental metrics we computed the median value per reef site as the number of Niskin deployments differed for molecular (four replicates) and water chemistry (three replicates) sampling.
# Making the heatmap of LTMP data
WQ_heatmap <- metadata[,24:40] %>% # I am only choosing columns with median values
scale(center = TRUE, scale = TRUE) %>% # I wand the values to be scaled
as.data.frame() %>% # Converting back to data frame - ggplot needs this
rownames_to_column("Sample_ID") %>% # Setting rownames as Col 1 - will need this for melting
reshape2::melt() %>% # Getting the long format - this is what geom_tile needs
left_join(metadata[, c(1,2)] # adding back the REEF NAME and SAMPLING TRIP vars
%>% rownames_to_column("Sample_ID")
) %>% # Need to convert row names to Column 1 and give Sample_ID as name, because I am joining those with the same ID
ggplot(aes(x = REEF_NAME, y = variable, fill = value)) +
geom_tile() + # Plotting the heatmap here
scale_fill_gradient2(low = "#075AFF",
mid = "#FFFFCC",
high = "#FF0000") + # The coloring scheme - red for high vals, blue for low
facet_wrap(~Sampling_trip, scales = "free_x", ncol = 4) + # now facetting reef sites based on the Sampling trip
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
WQ_heatmap
The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (median = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the median (0) were shown in red (> median) and blue (< median). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.
# Median is the default in ggplot2
reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling trip
ggplot(aes(y = value,
x = Sampling_trip,
fill = Sampling_trip),
alpha=0.8) +
geom_boxplot(#outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
geom_jitter(alpha = 0.6,
size = 0.8) +
stat_summary(fun=mean,
geom="point",
shape=20,
size=0.8,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
facet_grid(rows = vars(variable),
cols = vars(Sampling_trip),
scales = "free"
) +
scale_fill_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
labs(y = "WQ metrics",
x = "Reef sites",
title = "Boxplots for WQ metrics (Median & Mean)"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
FIG CAP TO BE ADDED.
Getting the numerical summary of physico-chemical variables
# Data needs to be in long format
wq_median_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
median=round(median(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_median_per_trip, caption = "Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.17 | 0.15 | 0.23 | 0.10 |
| Phaeophytin_a_.µg.L. | 0.17 | 0.17 | 0.36 | 0.10 |
| PN_.µM. | 1.23 | 1.19 | 1.30 | 0.50 |
| POC_.µM. | 7.12 | 7.77 | 9.74 | 3.52 |
| PP_.µM. | 0.04 | 0.04 | 0.06 | 0.02 |
| DOC_.µM. | 83.75 | 79.58 | 65.83 | 69.53 |
| PO4_.µM. | 0.05 | 0.04 | 0.02 | 0.09 |
| NH4_.µM. | 0.32 | 0.52 | 0.68 | 0.11 |
| NO2_.µM. | 0.02 | 0.04 | 0.03 | 0.01 |
| NO3_.µM. | 0.21 | 0.34 | 0.21 | 0.20 |
| Si_.µM. | 1.38 | 1.14 | 2.00 | 1.86 |
| TDN_.µM. | 5.43 | 6.53 | 5.70 | 5.28 |
| TDP_.µM. | 0.20 | 0.23 | 0.16 | 0.26 |
| TSS_.mg.L. | 0.43 | 0.13 | 0.08 | 0.05 |
| SEAWATER_TEMPERATURE_2.5m_RV | 27.78 | 27.16 | 30.16 | 24.40 |
| SALINITY_2.5m_RV | 35.27 | 35.55 | 34.72 | 35.16 |
| FLUORESCENCE_2.5m_RV | 0.10 | 0.11 | 0.32 | 0.09 |
# Data needs to be in long format
wq_mean_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
mean=round(mean(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_mean_per_trip, caption = "Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.18 | 0.16 | 0.32 | 0.11 |
| Phaeophytin_a_.µg.L. | 0.18 | 0.20 | 0.36 | 0.10 |
| PN_.µM. | 1.23 | 1.27 | 1.32 | 0.50 |
| POC_.µM. | 8.06 | 7.60 | 9.95 | 3.66 |
| PP_.µM. | 0.05 | 0.05 | 0.07 | 0.02 |
| DOC_.µM. | 84.51 | 81.92 | 67.22 | 69.30 |
| PO4_.µM. | 0.05 | 0.04 | 0.02 | 0.10 |
| NH4_.µM. | 0.39 | 0.58 | 0.74 | 0.12 |
| NO2_.µM. | 0.03 | 0.04 | 0.04 | 0.01 |
| NO3_.µM. | 0.30 | 0.33 | 0.35 | 0.23 |
| Si_.µM. | 1.41 | 1.30 | 2.10 | 1.79 |
| TDN_.µM. | 5.47 | 6.62 | 5.64 | 5.18 |
| TDP_.µM. | 0.20 | 0.23 | 0.16 | 0.26 |
| TSS_.mg.L. | 0.48 | 0.15 | 0.35 | 0.12 |
| SEAWATER_TEMPERATURE_2.5m_RV | 27.78 | 27.13 | 30.01 | 24.22 |
| SALINITY_2.5m_RV | 35.35 | 35.52 | 34.71 | 35.16 |
| FLUORESCENCE_2.5m_RV | 0.10 | 0.10 | 0.34 | 0.13 |
# Data needs to be in long format
wq_sd_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
sd=round(sd(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_sd_per_trip, caption = "SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.06 | 0.08 | 0.18 | 0.03 |
| Phaeophytin_a_.µg.L. | 0.04 | 0.08 | 0.15 | 0.02 |
| PN_.µM. | 0.35 | 0.46 | 0.22 | 0.10 |
| POC_.µM. | 2.86 | 1.89 | 2.29 | 1.00 |
| PP_.µM. | 0.02 | 0.02 | 0.03 | 0.01 |
| DOC_.µM. | 5.99 | 9.89 | 4.60 | 4.67 |
| PO4_.µM. | 0.03 | 0.02 | 0.02 | 0.02 |
| NH4_.µM. | 0.15 | 0.27 | 0.44 | 0.06 |
| NO2_.µM. | 0.02 | 0.01 | 0.02 | 0.00 |
| NO3_.µM. | 0.25 | 0.15 | 0.31 | 0.16 |
| Si_.µM. | 0.30 | 0.44 | 0.55 | 0.65 |
| TDN_.µM. | 0.83 | 0.82 | 0.72 | 0.75 |
| TDP_.µM. | 0.03 | 0.04 | 0.03 | 0.02 |
| TSS_.mg.L. | 0.41 | 0.15 | 0.52 | 0.10 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.43 | 0.61 | 0.39 | 0.95 |
| SALINITY_2.5m_RV | 0.21 | 0.17 | 0.05 | 0.04 |
| FLUORESCENCE_2.5m_RV | 0.01 | 0.02 | 0.05 | 0.12 |
# Exporting Medians as csv
write.csv(wq_median_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_Median_per_trip.csv", quote = F, row.names = F)
# Exporting Means as csv
write.csv(wq_mean_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_mean_per_trip.csv", quote = F, row.names = F)
# Exporting SD as csv
write.csv(wq_sd_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_SD_per_trip.csv", quote = F, row.names = F)
# The csv files on median and sd values were merged manually to make the Table 1 in the main text of the manuscript
Raw counts were exported from MEGAN as biom files separately for (1) microbial taxonomy (genus level as the lowest category) and for (2) microbial functions (GO terms), and subsequently imported into R using the phyloseq R package. These biom files were combined with the metadata file to create 2 phyloseq objects (for taxa and genes), which have then undergone various filtering steps.
### Importing the biom tables, exported from MEGAN
### Taxonomy info | at 'Genus' level
megan_genus <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
### Functional info | GO terms
megan_GO_5 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank5_191_samples_Neg_controls_July_2024.biom")
# This one has 7476 GO terms
megan_GO_4 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank4_191_samples_Neg_controls_July_2024.biom")
# This one has 5257 GO terms
megan_GO_3 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank3_191_samples_Neg_controls_July_2024.biom")
# This one has 705 GO terms
# Let's just modify the metadata file a bit to include neg controls as well
metadata_neg_controls <- left_join(megan_genus@otu_table %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID") %>%
dplyr::select("Sample_ID"),
metadata %>%
rownames_to_column("Sample_ID")) %>%
column_to_rownames("Sample_ID")
# Merging:
sample_data(megan_genus) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_5) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_4) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_3) <- sample_data(metadata_neg_controls)
# Checking the phyloseq objects
megan_genus
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2066 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7476 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5257 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 706 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 706 taxa by 3 taxonomic ranks ]
# But I want to filter out the PF samples, and Broomfield rep 2 (because the sequencing was repeated for this one)
megan_genus <- subset_samples(megan_genus, sample_names(megan_genus)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_genus)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_genus)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_genus)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_genus)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_genus)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_genus)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_genus)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_genus)!='Rib-PF-1_S103_R1' &
sample_names(megan_genus)!='Rib-PF-2_S104_R1' &
sample_names(megan_genus)!='Rib-PF-3_S105_R1' &
sample_names(megan_genus)!='Rib-PF-4_S106_R1' &
sample_names(megan_genus)!='Broomfield-2_S50_R1')
megan_GO_5 <- subset_samples(megan_GO_5, sample_names(megan_GO_5)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_5)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_5)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_5)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_5)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_5)!='Broomfield-2_S50_R1')
megan_GO_4 <- subset_samples(megan_GO_4, sample_names(megan_GO_4)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_4)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_4)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_4)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_4)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_4)!='Broomfield-2_S50_R1')
megan_GO_3 <- subset_samples(megan_GO_3, sample_names(megan_GO_3)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_3)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_3)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_3)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_3)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_3)!='Broomfield-2_S50_R1')
# Checking the object again
megan_genus
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2066 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7476 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5257 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 706 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 706 taxa by 3 taxonomic ranks ]
After removing the non pre-filtered samples, further data filtering included removal of reads (1) annotated as eukaryotic or viral; and (2) rare/spurious reads. Data was then Center-Log-Ratio (hereinafter ‘CLR’) transformed for statistical analysis in the mixOmics R package.
We annotated a total of 1919 microbial taxa (lowest category: genus level). Reads that were annotated as Eukarya (729 taxa in total) and viruses (11 viral annotations) were excluded from the analysis. Further analysis was performed on a phyloseq object with prokaryotic annotations only, a total of 1179 bacterial and archaeal groups (Figure 2, Table 1).
# Before plotting the bar plots, I first need to prepare my objects
### Taxonomy info | at 'Genus' level
megan_genus_all <- megan_genus
megan_genus_TAX_all <- as.data.frame(megan_genus_all@tax_table)
# Plot admixture barplot - Domain level (and viruses)
cols_domain <- c("slategray3", # Archaea
"grey45", # Bacteria
"salmon", # Eukaryota
"violetred", # f__Mimiviridae
"steelblue3", # f__Phycodnaviridae
"lightsteelblue4", # f__Retroviridae
"seashell4") # o__Caudovirales
DOMAIN <- as.data.frame(megan_genus_all@otu_table) %>%
rownames_to_column("OTUs") %>% # I will need this later to add taxonomy info
left_join(megan_genus_TAX_all %>% rownames_to_column("OTUs")) %>% # adding taxonomy info
column_to_rownames("OTUs") %>%
group_by(Rank1) %>%
# Keeping only numerical values now
summarise_if(.predicate = function(x) is.numeric(x),
.funs = funs(sum)) # Computing sums
# Now relative abundances
DOMAIN_RA <- DOMAIN
for (i in 2:(ncol(DOMAIN_RA))) {
DOMAIN_RA[i] <- DOMAIN_RA[i] / sum(DOMAIN_RA[i])
}
barplots_domain <- DOMAIN_RA %>%
column_to_rownames("Rank1") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID") %>%
reshape2::melt() %>%
left_join(metadata %>% rownames_to_column("Sample_ID")) %>%
# Plotting now!
ggplot(aes(x=Sample_ID, y=value, fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Sampling_trip, scales = "free", nrow = 5)+
# facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
scale_fill_manual(values = cols_domain)+
ylab("Relative abundance of taxa (at Domain level)")+
xlab("Reef sites")+
theme(axis.text.x = # element_blank(),
element_text(angle = 75, hjust = 1, size = 12),
#axis.ticks.x = element_blank(),
#axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 12))
barplots_domain
We see that we have eukaryotic reads, let’s see how many taxa?
megan_genus_bacteria <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
Rank1=="d__Bacteria") # The phyloseq object with raw counts
megan_genus_archaea <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
Rank1=="d__Archaea") # The phyloseq object with raw counts
megan_genus_PROKS <- merge_phyloseq(megan_genus_bacteria,
megan_genus_archaea) # Phyloseq object with Proks only
megan_genus_EUKS <- subset_taxa(megan_genus_all,
Rank1=="d__Eukaryota") # Phyloseq object with Euks only
knitr::kable(as.data.frame(cbind(as.character(ntaxa(megan_genus_EUKS)), as.character(ntaxa(megan_genus_bacteria)), as.character(ntaxa(megan_genus_archaea)), as.character(ntaxa(megan_genus_PROKS)))), caption = "Taxonomic breakdown", col.names = c("Eukaryota", "Bacteria", "Archaea", "Prokarya"))
| Eukaryota | Bacteria | Archaea | Prokarya |
|---|---|---|---|
| 774 | 1212 | 45 | 1257 |
If we compare with abundances of prokaryotes (which are the target for this study), are there any euks that are highly abundant?
megan_genus_all_with_euks <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
# Merging!
sample_data(megan_genus_all_with_euks) <- sample_data(metadata_neg_controls)
# Removing the PF samples and Broomfield 2 (seq failed for this rep, and we have the repeated sample for rep 2)
megan_genus_all_with_euks <- subset_samples(megan_genus_all_with_euks, sample_names(megan_genus_all_with_euks)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-1_S103_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-2_S104_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-3_S105_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-4_S106_R1' &
sample_names(megan_genus_all_with_euks)!='Broomfield-2_S50_R1')
# Removing the non-annotated stuff!
megan_genus_all_anno_only <- subset_taxa(megan_genus_all_with_euks, Rank2!="NA")
# Getting relative abundances too
megan_genus_all_RA = transform_sample_counts(megan_genus_all_with_euks, function(x) x / sum(x) )
# Selecting the top 100 most abundant MAGs (based on RA data)
megan_genus_top200_RA_abund_with_euks <- taxa_sums(megan_genus_all_RA) %>%
sort(decreasing = TRUE) %>%
head(200) %>% # Taking the first X most abundant taxa.
# Change the number depending on how many Genera I want to look at
names()
# Making a new phyloseq object
megan_genus_top200_RA_with_euks <- prune_taxa(megan_genus_top200_RA_abund_with_euks, # These are the top 20
megan_genus_all_RA)
# Defining breaks - to make sure even very lowly abundant taxa will be visible!
# From Steve:
breaks=c(0,0.001,0.01,0.05,0.1,0.25,0.4,0.5,0.6,0.7,1)
# But I want to have less breaks
# breaks_5=c(0,0.001,0.1,0.3,0.7,1)
# Plot heatmap
left_join(otu_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU"),
tax_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU")) %>%
arrange(match(OTU, megan_genus_top200_RA_abund_with_euks)) %>% # Arranging by abundances here
unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>% # Adding Taxonomy info
gather(Sample_ID, Reads, -taxonomy) %>% # 'Reads' contains the Raw counts
# left_join(as.data.frame(sample_data(megan_genus_all_RA)) %>% rownames_to_column("Sample_ID")) %>% # Now joining with the metadata
left_join(metadata %>% rownames_to_column("Sample_ID")) %>%
# Ready to plot now!
ggplot(aes(x = Sample_ID, # Short reef names on the x axis
y = reorder(taxonomy, # Taxonomy info on the y axis
Reads), # With Taxa ordered based on abundances, most abundant listed first
fill = Reads)) + # Change to 'Reads' if plotting the raw counts
geom_tile() + # This colors the heatmap in blue & makes the more abundant taxa darker in color
facet_grid(cols = vars(Sampling_trip), scales = "free_x", space = "free") + # Splitting in facets
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) + # Rotating the text at 90 degrees angle
# Use this if I want a smaller number of breaks
# scale_fill_stepsn(breaks = breaks_5, colours =c("white", # for the 0-0.001 RA range!
# "slategray1", "slategray2", "slategray3", "slategray4")) +
# Or from Steve:
scale_fill_stepsn(breaks = breaks, colours =c("white", # for the 0-0.001 RA range!
"slategray4", # 001 - 0.01
"slategray3", # 0.01 - 0.05
"slategray2", # 0.05 - 0.1
"navajowhite", # 0.1 - 0.25
"rosybrown2", # 0.25,0.4
"lightsalmon", # 0.4 - 0.5
"rosybrown1", # 0.5 - 0.6
# "lightgoldenrod1", # 0.6 - 0.7
"indianred2")) + # 0.7,1
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0))
Prior to removal of rare and spurious reads, non-annotated reads were removed from the dataset. We then computed relative abundance values (RA) and removed reads with average RA < 0.0001% across samples. After removing OTUs that were less than 0.0001% abundant, we retained 618 taxa (primarily at Genus level) out of the initial 1179 prokaryotic OTUs. At functional level, we retained 5015 GO annotations (out of 8689 GO terms).
### IMPORTANT - ***Change this part of the script*** depending on which phyloseq object I would like to look at: prokaryotic, eukaryotic or all. This way I wouldn't need to modify multiple lines in the script below
megan_genus <- megan_genus_PROKS # options to choose from: megan_genus_PROKS, megan_genus_EUKS
# Cleaning the names here already! This way I will make sure every other phyloseq object will have organised taxonomy
megan_genus_TAX_PROKS <- as.data.frame(megan_genus@tax_table)
# Unite the names within one column called "Taxonomy"
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>%
unite(Taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") # Adding Taxonomy info
# Initialize empty columns
megan_genus_TAX_PROKS$Domain <- NA
megan_genus_TAX_PROKS$Phylum <- NA
megan_genus_TAX_PROKS$Class <- NA
megan_genus_TAX_PROKS$Order <- NA
megan_genus_TAX_PROKS$Family <- NA
megan_genus_TAX_PROKS$Genus <- NA
megan_genus_TAX_PROKS$Species <- NA
# Categorise taxonomic strings based on patterns:
megan_genus_TAX_PROKS$Domain <- str_match(megan_genus_TAX_PROKS$Taxonomy, "^d__(.+?);")[, 2]
megan_genus_TAX_PROKS$Phylum <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; p__(.+?);")[, 2]
megan_genus_TAX_PROKS$Class <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; c__(.+?);")[, 2]
megan_genus_TAX_PROKS$Order <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; o__(.+?);")[, 2]
megan_genus_TAX_PROKS$Family <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; f__(.+?);")[, 2]
megan_genus_TAX_PROKS$Genus <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; g__(.+?);")[, 2]
megan_genus_TAX_PROKS$Species <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; s__(.+?);")[, 2]
# Last thing: replacing missing values with "Unknown_*"
megan_genus_TAX_PROKS$Domain[is.na(megan_genus_TAX_PROKS$Domain)] <- "Unknown Domain"
megan_genus_TAX_PROKS$Phylum[is.na(megan_genus_TAX_PROKS$Phylum)] <- "Unknown Phylum"
megan_genus_TAX_PROKS$Class[is.na(megan_genus_TAX_PROKS$Class)] <- "Unknown Class"
megan_genus_TAX_PROKS$Order[is.na(megan_genus_TAX_PROKS$Order)] <- "Unknown Order"
megan_genus_TAX_PROKS$Family[is.na(megan_genus_TAX_PROKS$Family)] <- "Unknown Family"
megan_genus_TAX_PROKS$Genus[is.na(megan_genus_TAX_PROKS$Genus)] <- "Unknown Genus"
megan_genus_TAX_PROKS$Species[is.na(megan_genus_TAX_PROKS$Species)] <- "Unknown Species"
# Remove the original taxonomy column
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>%
dplyr::select(Domain, Phylum, Class, Order, Family, Genus, Species)
# All cleaned up! :) thanks ChatGPT
### Putting this back into the phyloseq object:
# First checking the current taxonomic names in phyloseq object
current_taxa_names <- taxa_names(megan_genus)
# Compare with tax_table column names and order
polished_tax_table <- colnames(t(megan_genus_TAX_PROKS))
# Check if they match
if (!identical(current_taxa_names, polished_tax_table)) {
stop("Polished taxonomic names in megan_genus_TAX_PROKS do not match the taxa_names in the megan_genus phyloseq object.")
}
# Looks like they match! So I'm not sure why I cannot merge them (code below)
# Check dimensions of tax_table and physeq - does the number of rows match?
nrow_tax_table <- nrow(megan_genus_TAX_PROKS)
ntaxa_physeq <-ntaxa(megan_genus) # Number of taxa in physeq
if (nrow_tax_table != ntaxa_physeq) {
stop("Number of rows in megan_genus_TAX_PROKS does not match the number of taxa in megan_genus phyloseq object.")
}
# Step 3: Compare order of unique values
identical(megan_genus_TAX_PROKS %>% # Looking for OTU order for the polishes taxa
rownames_to_column("OTUs") %>%
dplyr::select("OTUs"),
megan_genus@tax_table %>% # Looking for OTU order in the current phyloseqq object
as.data.frame() %>%
rownames_to_column("OTUs") %>%
dplyr::select("OTUs"))
## [1] TRUE
# Again, this is also the same
# Here too:
identical(row.names(megan_genus_TAX_PROKS),
row.names(otu_table(megan_genus))
)
## [1] TRUE
# Yes, the row names are identical
# Now adding this polished taxonomy to my phyloseq object:
tax_table(megan_genus) <- as.matrix(megan_genus_TAX_PROKS)
### Removing the negative controls too - taxa phyloseq object:
megan_genus_no_neg_control <- subset_samples(megan_genus,
sample_names(megan_genus)!='Neg-control-1_S101_R1' &
sample_names(megan_genus)!='Neg-control-2_S24_R1' &
sample_names(megan_genus)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 5 phyloseq object:
megan_GO_5_no_neg_control <- subset_samples(megan_GO_5,
sample_names(megan_GO_5)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_5)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_5)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 4 phyloseq object:
megan_GO_4_no_neg_control <- subset_samples(megan_GO_4,
sample_names(megan_GO_4)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_4)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_4)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 3 phyloseq object:
megan_GO_3_no_neg_control <- subset_samples(megan_GO_3,
sample_names(megan_GO_3)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_3)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_3)!='Neg-control-3_S116_R1')
### Instead of setting an arbitrary threshold (e.g 100 seqs), I would like to filter based on relative abundances (***removing all OTUs < 0.0001% rel. abundance***)
# Tutorial I used: https://joey711.github.io/phyloseq/preprocess.html
### Removing reads that are annotated at Bacteria or Archaea levels only - not informative!
megan_genus_anno_only <- subset_taxa(megan_genus_no_neg_control, Domain!="NA")
### Removing reads that were not annotated at Rank2 level - not informative!
megan_GO_5_anno_only <- subset_taxa(megan_GO_5_no_neg_control, Rank2!="NA")
megan_GO_4_anno_only <- subset_taxa(megan_GO_4_no_neg_control, Rank2!="NA")
megan_GO_3_anno_only <- subset_taxa(megan_GO_3_no_neg_control, Rank2!="NA")
# Getting the taxa data frame
megan_genus_TAX <- as.data.frame(megan_genus_anno_only@tax_table)
# Getting the taxa data frame
megan_GO_5_FUN <- as.data.frame(megan_GO_5_anno_only@tax_table)
megan_GO_4_FUN <- as.data.frame(megan_GO_4_anno_only@tax_table)
megan_GO_3_FUN <- as.data.frame(megan_GO_3_anno_only@tax_table)
### Getting the relative abundances
# Taxa
megan_genus_RA = transform_sample_counts(megan_genus_anno_only, function(x) x / sum(x) )
# GO terms
megan_GO_5_RA = transform_sample_counts(megan_GO_5_anno_only, function(x) x / sum(x) )
megan_GO_4_RA = transform_sample_counts(megan_GO_4_anno_only, function(x) x / sum(x) )
megan_GO_3_RA = transform_sample_counts(megan_GO_3_anno_only, function(x) x / sum(x) )
# removing all OTUs that are less than 0.0001% abundant
megan_genus_RA_no_rare = filter_taxa(megan_genus_RA, function(x) mean(x) > 1e-6, TRUE)
# removing all genes that are less than 0.0001% abundant
megan_GO_5_RA_no_rare = filter_taxa(megan_GO_5_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_3_RA_no_rare = filter_taxa(megan_GO_3_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_4_RA_no_rare = filter_taxa(megan_GO_4_RA, function(x) mean(x) > 1e-6, TRUE)
Before_after_filtering <- cbind(rbind(ntaxa(megan_genus), ntaxa(megan_genus_RA_no_rare)),
rbind(ntaxa(megan_GO_5), ntaxa(megan_GO_5_RA_no_rare))
# rbind(ntaxa(megan_COGs), ntaxa(megan_COGs_RA_no_rare))
) %>%
as.data.frame()
# Adding row names now
row.names(Before_after_filtering) <- c("Before filtering", "After filtering")
knitr::kable(Before_after_filtering, caption = "Removal of Rare/Spurious reads (< 0.0001% RA)", col.names = c("Taxa", "GO terms"), row.names = T)
| Taxa | GO terms | |
|---|---|---|
| Before filtering | 1257 | 7476 |
| After filtering | 561 | 4287 |
megan_genus_abundant <- prune_taxa(taxa_names(megan_genus_RA_no_rare), # List of OTUs after filtering
megan_genus_no_neg_control) # My phyloseq object with raw counts
megan_GO_5_abundant <- prune_taxa(taxa_names(megan_GO_5_RA_no_rare), # List of OTUs after filtering
megan_GO_5_no_neg_control) # My phyloseq object with raw counts
megan_GO_4_abundant <- prune_taxa(taxa_names(megan_GO_4_RA_no_rare), # List of OTUs after filtering
megan_GO_4_no_neg_control) # My phyloseq object with raw counts
megan_GO_3_abundant <- prune_taxa(taxa_names(megan_GO_3_RA_no_rare), # List of OTUs after filtering
megan_GO_3_no_neg_control) # My phyloseq object with raw counts
# CLR is the normalisation method suggested by the mixOmics R package for microbial data - a way to address missing values that are characteristic of microbial datasets. I need to remove missing values before doing the CLR normalisation - The geometric mean cannot be determined for sparse data without deleting, replacing or estimating the 0 count values. So I am introducing pseudo counts
### Tutorial used: http://mixomics.org/mixmc/mixmc-preprocessing/
# Checking if there are any zeros - BEFORE adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 3698620822
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 43240551782
# sum(which(megan_COGs_abundant@otu_table == 0))
# Pseudocounts - replacing all zero vals with 1;
megan_genus_abundant@otu_table <- megan_genus_abundant@otu_table + 1
megan_GO_5_abundant@otu_table <- megan_GO_5_abundant@otu_table +1
megan_GO_3_abundant@otu_table <- megan_GO_3_abundant@otu_table +1
megan_GO_4_abundant@otu_table <- megan_GO_4_abundant@otu_table +1
# megan_COGs_abundant@otu_table <- megan_COGs_abundant@otu_table + 1
# Checking if there are any zeros - AFTER adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 0
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 0
# sum(which(megan_COGs_abundant@otu_table == 0))
# All good! No NAs after introducing pseudocounts
### Now I can CLR transform when running analyses in mixOmics!
# I am using an option from the microbiome R package, not the same as in MixOmics.
megan_genus_clr <- microbiome::transform(megan_genus_abundant, "clr")
megan_go_clr_5 <- microbiome::transform(megan_GO_5_abundant, "clr")
megan_go_clr_3 <- microbiome::transform(megan_GO_3_abundant, "clr")
megan_go_clr_4 <- microbiome::transform(megan_GO_4_abundant, "clr")
# megan_COGs_clr <- microbiome::transform(megan_COGs_abundant, "clr")
# megan_go_clr_3_bp <- megan_go_clr_3 %>%
# subset_taxa(Rank2 == 'GO:0008150 biological_process')
# megan_go_clr_4_bp <- megan_go_clr_4 %>%
# subset_taxa(Rank2 == 'GO:0008150 biological_process')
# But for GO at lvl 3, I only want bio process
# save.image("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/Code_for_Terzin_et_al_Microbial_Function_Outperforms_Taxonomy_in_Inferring_Water_Chemistry_across_the_Great_Barrier_Reef.RData")
# Preparing the object to have taxa names on boxplots
OTUs_biplot <- as.data.frame(megan_genus_clr@otu_table) %>%
t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(OTUs_biplot)
## [1] 191 561
class(OTUs_biplot)
## [1] "matrix" "array"
# Getting taxa names
OTUs_biplot_colnames_for_biplot <- left_join(otu_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(Family, Genus), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
OTUs_biplot_colnames_for_biplot <- OTUs_biplot_colnames_for_biplot %>%
dplyr::select("OTU", "taxonomy")
# Merging with the OTUs_biplot object
OTUs_biplot_names <- left_join(t(OTUs_biplot) %>%
as.data.frame() %>%
rownames_to_column("OTU"),
OTUs_biplot_colnames_for_biplot) %>%
unite(Annotations, c(OTU, taxonomy), sep = "_") %>%
column_to_rownames("Annotations") %>% # moving this as rowposing back into the right format
t() # trans
# PCA
result.pca.taxa.names <- pca(OTUs_biplot_names)
# Plotting the PCA sample plot
plotIndiv(result.pca.taxa.names,
group = sample_data(megan_genus_abundant)$Sampling_trip,
title = 'PCA | Microbial Taxonomy',
legend = T,
ellipse = TRUE,
ind.names = F,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend.title = 'Sampling trip'
)
# Plotting the PCA biplot
biplot(result.pca.taxa.names,
comp = c(1, 2),
group = sample_data(megan_genus_abundant)$Sampling_trip,
ind.names = F,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
vline = T,
hline = T,
cutoff = 0.65,
legend.title = "Sampling trip")
# Parameter tuning
tune.pca.taxa <- tune.pca(OTUs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.taxa)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.taxa.num <- pca(OTUs_biplot_names, # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.taxa.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.2030513 |
| PC2 | 0.1235319 |
| PC3 | 0.0686204 |
| PC4 | 0.0537983 |
| PC5 | 0.0456617 |
| PC6 | 0.0431242 |
| PC7 | 0.0303674 |
| PC8 | 0.0226126 |
| PC9 | 0.0216532 |
| PC10 | 0.0178213 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.taxa.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.2030513 |
| PC2 | 0.3265833 |
| PC3 | 0.3952037 |
| PC4 | 0.4490020 |
| PC5 | 0.4946637 |
| PC6 | 0.5377879 |
| PC7 | 0.5681553 |
| PC8 | 0.5907679 |
| PC9 | 0.6124211 |
| PC10 | 0.6302423 |
# Preparing the object to have taxa names on boxplots
GOs_biplot <- as.data.frame(megan_go_clr_5@otu_table) %>%
t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(GOs_biplot)
## [1] 191 4287
class(GOs_biplot)
## [1] "matrix" "array"
# Getting gene names
GOs_biplot_colnames_for_biplot <- left_join(otu_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(Gene_annotations, c(Rank4), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
GOs_biplot_colnames_for_biplot <- GOs_biplot_colnames_for_biplot %>%
dplyr::select("OTU", "Gene_annotations")
# Merging with the OTUs_biplot object
GOs_biplot_names <- left_join(t(GOs_biplot) %>%
as.data.frame() %>%
rownames_to_column("OTU"),
GOs_biplot_colnames_for_biplot) %>%
unite(Annotations, c(OTU, Gene_annotations), sep = "_") %>%
column_to_rownames("Annotations") %>% # moving this as rowposing back into the right format
t() # trans
# PCA
result.pca.GOs.names <- pca(GOs_biplot_names)
# Plotting the PCA sample plot
plotIndiv(result.pca.GOs.names,
group = sample_data(megan_GO_5_abundant)$Sampling_trip,
title = 'PCA | Microbial Functions',
legend = T,
ellipse = TRUE,
ind.names = F,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend.title = 'Sampling trip'
)
# Plotting the PCA biplot
biplot(result.pca.GOs.names,
comp = c(1, 2),
group = sample_data(megan_GO_5_abundant)$Sampling_trip,
ind.names = F,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
vline = T,
hline = T,
cutoff = 0.95,
legend.title = "Sampling trip")
# Parameter tuning
tune.pca.GOs <- tune.pca(GOs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.GOs)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.GOs.num <- pca(GOs_biplot_names, # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.GOs.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.3436569 |
| PC2 | 0.1184468 |
| PC3 | 0.0592101 |
| PC4 | 0.0477584 |
| PC5 | 0.0386741 |
| PC6 | 0.0292849 |
| PC7 | 0.0203148 |
| PC8 | 0.0186514 |
| PC9 | 0.0143016 |
| PC10 | 0.0118723 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.GOs.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.3436569 |
| PC2 | 0.4621037 |
| PC3 | 0.5213138 |
| PC4 | 0.5690722 |
| PC5 | 0.6077463 |
| PC6 | 0.6370312 |
| PC7 | 0.6573461 |
| PC8 | 0.6759974 |
| PC9 | 0.6902990 |
| PC10 | 0.7021713 |
Parameter tuning in mixOmics to identify the optimal number of principal components (PCs) showed that the variance explained by adding more than 2 PCs is insignificant for both taxonomy and function. Hence, 2 PCs were retained. PCA clustering identified a clear difference between summer and winter samples, for both taxonomy and function. However, this clustering becomes more evident at functional compared to taxonomic levels. The percentage of variance explained by the first 2 Principal components (PCs) equaled to ~26% for taxonomy, and 55% for functions (GO terms). A PERMANOVA test was then carried out to investigate which comparisons are statistically significant.
Analysis of similarities (ANOSIM) testing whether there is a statistically significant difference between two or more groups of sampling units - sampling trips. We will then perform a Pairwise PERMANOVA.
taxa.anosim <- left_join(otu_table(megan_genus_RA_no_rare) %>%
as.data.frame %>%
rownames_to_column("OTU"),
megan_genus_TAX %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(OTU, Domain, Phylum, Class, Order, Family, Genus, Species), sep = "; ") %>%
column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
taxa.anosim <- na.omit(taxa.anosim)
# Object is ready to perform the test
ano_taxa <- anosim(t(taxa.anosim),
sample_data(megan_genus_RA_no_rare)$Sampling_trip,
distance = "bray",
permutations = 9999)
# Results
ano_taxa
##
## Call:
## anosim(x = t(taxa.anosim), grouping = sample_data(megan_genus_RA_no_rare)$Sampling_trip, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.2459
## Significance: 1e-04
##
## Permutation: free
## Number of permutations: 9999
Pairwise PERMANOVA - taxa
Phylum level
Out of the 29 bacterial and archaeal phyla we identified, the most abundant phyla were Cyanobacteria, Proteobacteria, and Bacteroidetes, respectively. Bacteroidetes increased in abundance for those samples collected during the peak of summer (February 2020), and were lowest in abundances during winter (July 2020).
## Joining with `by = join_by(OTU)`
## Joining with `by = join_by(Sample_ID)`
## [1] 1
| Group.1 | x |
|---|---|
| Unknown Phylum | 0.4727718 |
| Cyanobacteria | 0.3668856 |
| Proteobacteria | 0.1317830 |
| Bacteroidetes | 0.0126634 |
| Actinobacteria | 0.0079718 |
| Planctomycetes | 0.0024819 |
| Firmicutes | 0.0020133 |
| Verrucomicrobia | 0.0010665 |
| Balneolaeota | 0.0009227 |
| Thaumarchaeota | 0.0003119 |
| Spirochaetes | 0.0002799 |
| Euryarchaeota | 0.0002221 |
| Candidatus Thermoplasmatota | 0.0001400 |
| Fusobacteria | 0.0000869 |
| Rhodothermaeota | 0.0000807 |
| Lentisphaerae | 0.0000689 |
| Tenericutes | 0.0000574 |
| Nitrospinae | 0.0000417 |
| Candidatus Tectomicrobia | 0.0000289 |
| Acidobacteria | 0.0000283 |
| Nitrospirae | 0.0000279 |
| Chloroflexi | 0.0000222 |
| Deinococcus-Thermus | 0.0000116 |
| Candidatus Peregrinibacteria | 0.0000088 |
| Chlamydiae | 0.0000069 |
| Gemmatimonadetes | 0.0000057 |
| Candidatus Kaiserbacteria | 0.0000054 |
| Candidatus Marinimicrobia | 0.0000026 |
| Fibrobacteres | 0.0000018 |
## [1] 4
| Group.1 | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Acidobacteria | 0.0000001 | 0.0000393 | 0.0000321 | 0.0000382 |
| Actinobacteria | 0.0093240 | 0.0104277 | 0.0067635 | 0.0057322 |
| Bacteroidetes | 0.0114547 | 0.0091737 | 0.0265800 | 0.0059184 |
| Balneolaeota | 0.0014112 | 0.0009432 | 0.0011873 | 0.0003183 |
| Candidatus Kaiserbacteria | 0.0000001 | 0.0000210 | 0.0000002 | 0.0000002 |
| Candidatus Marinimicrobia | 0.0000108 | 0.0000001 | 0.0000002 | 0.0000002 |
| Candidatus Peregrinibacteria | 0.0000085 | 0.0000195 | 0.0000079 | 0.0000006 |
| Candidatus Tectomicrobia | 0.0000001 | 0.0000246 | 0.0000041 | 0.0000741 |
| Candidatus Thermoplasmatota | 0.0000147 | 0.0001967 | 0.0000164 | 0.0002847 |
| Chlamydiae | 0.0000002 | 0.0000264 | 0.0000003 | 0.0000004 |
| Chloroflexi | 0.0000001 | 0.0000265 | 0.0000250 | 0.0000335 |
| Cyanobacteria | 0.3425560 | 0.3769217 | 0.3654358 | 0.3785127 |
| Deinococcus-Thermus | 0.0000121 | 0.0000344 | 0.0000003 | 0.0000004 |
| Euryarchaeota | 0.0001920 | 0.0002252 | 0.0002329 | 0.0002349 |
| Fibrobacteres | 0.0000001 | 0.0000065 | 0.0000002 | 0.0000002 |
| Firmicutes | 0.0019127 | 0.0020475 | 0.0018850 | 0.0021616 |
| Fusobacteria | 0.0001014 | 0.0001372 | 0.0000967 | 0.0000250 |
| Gemmatimonadetes | 0.0000002 | 0.0000107 | 0.0000128 | 0.0000004 |
| Lentisphaerae | 0.0000604 | 0.0000589 | 0.0001671 | 0.0000087 |
| Nitrospinae | 0.0000137 | 0.0000248 | 0.0000499 | 0.0000720 |
| Nitrospirae | 0.0000144 | 0.0000409 | 0.0000229 | 0.0000311 |
| Planctomycetes | 0.0023251 | 0.0027011 | 0.0021333 | 0.0026849 |
| Proteobacteria | 0.1309415 | 0.1327437 | 0.1313794 | 0.1319308 |
| Rhodothermaeota | 0.0000642 | 0.0001129 | 0.0001349 | 0.0000244 |
| Spirochaetes | 0.0001831 | 0.0002533 | 0.0002024 | 0.0004384 |
| Tenericutes | 0.0000861 | 0.0000005 | 0.0001653 | 0.0000008 |
| Thaumarchaeota | 0.0002969 | 0.0003423 | 0.0002543 | 0.0003419 |
| Unknown Phylum | 0.4978213 | 0.4623094 | 0.4622533 | 0.4701345 |
| Verrucomicrobia | 0.0011939 | 0.0011304 | 0.0009563 | 0.0009964 |
go.anosim <- left_join(otu_table(megan_GO_5_RA_no_rare) %>%
as.data.frame %>%
rownames_to_column("OTU"),
megan_GO_5_FUN %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6#, Rank7, Rank8
), sep = "; ") %>%
column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
go.anosim <- na.omit(go.anosim)
# Object is ready to perform the test
ano_go <- anosim(t(go.anosim),
sample_data(megan_GO_5_RA_no_rare)$Sampling_trip,
distance = "bray",
permutations = 9999)
# Results
ano_go
##
## Call:
## anosim(x = t(go.anosim), grouping = sample_data(megan_GO_5_RA_no_rare)$Sampling_trip, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.3743
## Significance: 1e-04
##
## Permutation: free
## Number of permutations: 9999
Pairwise PERMANOVA - GO terms (rank 5)
# Compute the mantel tests - cite the source of where this is coming from!
multimantel<-function(distance,env.df,geo.dist){
BCdist<-distance
statistic<-NULL
pval<-NULL
n.obs<-NULL
for (i in 1:ncol(env.df)){
na.pos<-which(is.na(env.df[,i]))
if (length(na.pos)>0) tmp<-mantel.partial(as.dist(as.matrix(BCdist)[-c(na.pos),-c(na.pos)]),dist(env.df[-c(na.pos),i]),as.dist(as.matrix(geo.dist)[-c(na.pos),-c(na.pos)]),method = "pearson",permutations = 1000) else tmp<-mantel.partial(BCdist,dist(env.df[,i]),geo.dist,method = "pearson",permutations = 1000)
statistic<-c(statistic,tmp$statistic)
pval<-c(pval,tmp$signif)
n.obs<-c(n.obs,nrow(env.df)-length(na.pos))
}
data.frame(var=colnames(env.df),statistic,pval,p.corr=p.adjust(pval,method="bonferroni"),n.obs)
}
### Calculate Bray-Curtis dissimilarities - doing this on the Relative abundance data when rare taxa were excluded
# Taxonomy
megan_genus_dist <- vegdist(t(otu_table(megan_genus_RA_no_rare)), method = "bray")
# GO terms
megan_go_dist <- vegdist(t(otu_table(megan_GO_5_RA_no_rare)), method = "bray")
# Getting distances (in km) for IMOS-MGD sites - this is important because the Mantels will be corrected for geography
# Getting distances (in km) for IMOS-MGD sites
metadata_Mantel <- sample_data(megan_genus_clr) %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID")
# Importing the coordinates
map_coords_Mantel <- read.csv("/home/markoterzin/Documents/PhD/IMOS_reporting/IMOS_MGD_metadata/MARKO_for_eReefs_Lats_Longs.csv")
map_coords_Mantel <- left_join(metadata_Mantel[,c(1,2)], map_coords_Mantel, by = c("REEF_NAME" = "name"))
# map_coords <- map_coords %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
map_coords_Mantel$REEF_NAME <- NULL
names(map_coords_Mantel)[names(map_coords_Mantel) == 'Sample_ID'] <- 'name'
# Setting first column as row names
map_coords_Mantel <- map_coords_Mantel %>%
remove_rownames %>%
column_to_rownames(var="name")
# Need to reorder as pointDistance() function requires longitude to go first
map_coords_reorder <- map_coords_Mantel %>%
relocate(lon, lat)
# Probably better to compute this 4 times for each of the trips, but first need to make sure that this code works
IMOS_mar.dist.mat <- round(pointDistance(map_coords_reorder, lonlat=TRUE) / 1000)
rownames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID
colnames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID
# Trick here: Now adding one column to the front so that I can make the correlation plot for both midshelf and offshore reefs
metadata_Mantel <- cbind(a = 0, metadata_Mantel)
# partial Mantels - microbial taxa
partial_Mantel_taxa_res <- multimantel(as.dist(as.matrix(megan_genus_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
# for the epipelagic layer
metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic
# distances, in km
knitr::kable(partial_Mantel_taxa_res %>% arrange(abs(statistic)),
caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level."
)
| var | statistic | pval | p.corr | n.obs |
|---|---|---|---|---|
| median_Si_µM | -0.0002110 | 0.4845155 | 1.0000000 | 191 |
| median_DOC_µM | -0.0028420 | 0.5484515 | 1.0000000 | 191 |
| median_NO3_µM | -0.0047510 | 0.5024975 | 1.0000000 | 191 |
| SALINITY_2.5m_RV | -0.0050375 | 0.5484515 | 1.0000000 | 191 |
| FLUORESCENCE_2.5m_RV | 0.0124093 | 0.3266733 | 1.0000000 | 191 |
| median_Chlorophyll_A_µg_L | 0.0340392 | 0.1928072 | 1.0000000 | 191 |
| median_TDN_µM | -0.0352454 | 0.8861139 | 1.0000000 | 191 |
| median_TSS_mg_L | -0.0376407 | 0.8111888 | 1.0000000 | 187 |
| median_Phaeophytin_A_µg_L | 0.0463218 | 0.1008991 | 1.0000000 | 191 |
| median_NO2_µM | 0.0769044 | 0.0039960 | 0.0679321 | 191 |
| median_NH4_µM | 0.0867594 | 0.0139860 | 0.2377622 | 191 |
| median_PP_µM | 0.1486216 | 0.0009990 | 0.0169830 | 187 |
| median_TDP_µM | 0.1989444 | 0.0009990 | 0.0169830 | 191 |
| median_POC_µM | 0.2200806 | 0.0009990 | 0.0169830 | 191 |
| median_PN_µM | 0.2362877 | 0.0009990 | 0.0169830 | 191 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.2674913 | 0.0009990 | 0.0169830 | 191 |
| median_PO4_µM | 0.2823732 | 0.0009990 | 0.0169830 | 191 |
# partial Mantels - microbial function (GO terms)
partial_Mantel_GOs_res <- multimantel(as.dist(as.matrix(megan_go_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
# for the epipelagic layer
metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic
# distances, in km
knitr::kable(partial_Mantel_GOs_res %>% arrange(abs(statistic)),
caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level."
)
| var | statistic | pval | p.corr | n.obs |
|---|---|---|---|---|
| median_Si_µM | 0.0111393 | 0.3076923 | 1.0000000 | 191 |
| SALINITY_2.5m_RV | -0.0130414 | 0.6993007 | 1.0000000 | 191 |
| median_TSS_mg_L | -0.0242762 | 0.7142857 | 1.0000000 | 187 |
| median_TDN_µM | 0.0293237 | 0.1438561 | 1.0000000 | 191 |
| median_NO2_µM | 0.0304584 | 0.1388611 | 1.0000000 | 191 |
| median_NO3_µM | -0.0370669 | 0.8851149 | 1.0000000 | 191 |
| median_NH4_µM | 0.0511959 | 0.0459540 | 0.7812188 | 191 |
| FLUORESCENCE_2.5m_RV | 0.0532699 | 0.0279720 | 0.4755245 | 191 |
| median_Phaeophytin_A_µg_L | 0.0930029 | 0.0069930 | 0.1188811 | 191 |
| median_Chlorophyll_A_µg_L | 0.1126347 | 0.0019980 | 0.0339660 | 191 |
| median_DOC_µM | 0.1157628 | 0.0009990 | 0.0169830 | 191 |
| median_PP_µM | 0.1189701 | 0.0009990 | 0.0169830 | 187 |
| median_PN_µM | 0.2554285 | 0.0009990 | 0.0169830 | 191 |
| median_TDP_µM | 0.2583475 | 0.0009990 | 0.0169830 | 191 |
| median_PO4_µM | 0.2640840 | 0.0009990 | 0.0169830 | 191 |
| median_POC_µM | 0.2755923 | 0.0009990 | 0.0169830 | 191 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.2990635 | 0.0009990 | 0.0169830 | 191 |
# WQ
partial_Mantel_cor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$statistic,
# GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
row.names = partial_Mantel_taxa_res$var)
partial_Mantel_pcor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$pval,
# GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
row.names = partial_Mantel_taxa_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)
# WQ
partial_Mantel_cor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$statistic,
# GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
row.names = partial_Mantel_GOs_res$var)
partial_Mantel_pcor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$pval,
# GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
row.names = partial_Mantel_GOs_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)
# Let's visualise this! as heatmaps:
# Taxonomy
heatmap_partial_Mantels_taxa_WQ <- ggcorrplot(partial_Mantel_cor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
p.mat=partial_Mantel_pcor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
insig = "blank",
sig.level = 0.05,
method = "square",
lab=T,
lab_size = 2.5,
colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_taxa_WQ
# Functions
heatmap_partial_Mantels_GOs_WQ <- ggcorrplot(partial_Mantel_cor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
p.mat=partial_Mantel_pcor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
insig = "blank",
sig.level = 0.05,
method = "square",
lab=T,
lab_size = 2.5,
colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_GOs_WQ
# Merging the two
# patchwork::wrap_plots(heatmap_partial_Mantels_taxa_WQ,
# heatmap_partial_Mantels_GOs_WQ,
# nrow = 2,
# ncol = 1)